########################################################
## PROGRAM NAME: 005_int_stats.R                      ##
## AUTHOR: MATT MLECZKO                               ##
## DATE CREATED: 05/05/2021                           ##
## INPUTS:                                            ##
##    002_af_wide_final.Rda                           ##
##                                                    ##
## OUTPUTS:                                           ##
##                                                    ## 
## PURPOSE: General integration trends,               ##
##          in addition to HHF analyses;              ##
##          stats for Table 4                         ##
##                                                    ##
## LIST OF UPDATES:                                   ##
##                                                    ##
########################################################

log <- file("005_int_stats.txt") 
sink(log, append=TRUE)
sink(log, append=TRUE, type="message")

## load libraries ##

library(tidycensus)
library(tigris)
library(tidyverse)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(readxl)
library(sf)
library(haven)
library(data.table)
library(ggridges)
library(data.table)

## define paths ##
data_path <- "PATH TO DATA HERE"
progs <- "PATH TO PROGRAMS HERE"

## set working directory ##
setwd(data_path)

##
## load necessary functions ## 
##

`%notin%` <- Negate(`%in%`)
source(paste0(progs,"stata_merge.R"))

## load the analytic file ##
load("002_af_wide_final.Rda")

## calculate number of tracts ##
num.tracts <- af.wide.final %>%
  group_by(CBSA) %>%
  summarize(tracts = n())

##
## decomposition of within metro D ##
##

load("001_trt_t_pl.Rda")
load("001_2020_Census_del.Rda")

##
## limit to tracts in metro areas ##
##

paste("IS THE TRACT TO PLACE FILE UNIQUE BY FIPS?")
length(unique(tract.to.pl$FIPS)) == nrow(tract.to.pl)
paste("IS THE CENSUS DELINEATION FILE UNIQUE BY FIPS?")
length(unique(census.del.2020.final$FIPS)) == nrow(census.del.2020.final)

tract.to.pl.metro.m <- stata.merge(tract.to.pl,
                                   census.del.2020.final,
                                   "FIPS")

## check merge ##
paste("MERGE BETWEEN TRACT TO PLACE XWALK AND CENSUS DELINEATION FILE")
paste("THE ONE NON-MATCHING OBS (=1) IS PLYMOUTH COUNTY, IA")
table(tract.to.pl.metro.m$merge.variable)

## keep valid matches ##
cc.tracts <- tract.to.pl.metro.m %>%
  filter(merge.variable == 3 & 
         `Metropolitan/Micropolitan Statistical Area` !="Micropolitan Statistical Area") %>%
  select(GEOID,
         afact_num) %>%
  group_by(GEOID) %>%
  summarize(afact_num = sum(afact_num, na.rm=T))

## checks ##
paste("IS THE CC FILE UNIQUE BY GEOID")
nrow(cc.tracts) == length(unique(cc.tracts$GEOID))
paste("SUMMARY OF AFACT")
summary(cc.tracts$afact_num)

## merge back to analytic file ##
af.wide.final.wua.m <- stata.merge(af.wide.final,
                                   cc.tracts,
                                   "GEOID")

## check merge ## 
paste("MERGE BETWEEN CC INDICATOR FILE AND ANALYTIC FILE")
table(af.wide.final.wua.m$merge.variable, useNA = "ifany")

## create ua indicator ##
af.wide.final.wua <- af.wide.final.wua.m %>%
  filter(merge.variable %in% c(1,3)) %>%
  mutate(cc = ifelse(merge.variable == 3, 1, 0),
         pop_total_2000_cc = case_when(cc == 1 & afact_num == 1 ~ pop_total_2000,
                                       cc == 1 & afact_num < 1 ~ pop_total_2000*afact_num,
                                       cc == 0 ~ 0),
         pop_total_2000_notcc = case_when(cc == 1 & afact_num == 1 ~ 0,
                                          cc == 1 & afact_num < 1 ~ pop_total_2000*(1-afact_num),
                                          cc == 0 ~ pop_total_2000),
         pop_total_2010_cc = case_when(cc == 1 & afact_num == 1 ~ pop_total_2010,
                                       cc == 1 & afact_num < 1 ~ pop_total_2010*afact_num,
                                       cc == 0 ~ 0),
         pop_total_2010_notcc = case_when(cc == 1 & afact_num == 1 ~ 0,
                                          cc == 1 & afact_num < 1 ~ pop_total_2010*(1-afact_num),
                                          cc == 0 ~ pop_total_2010),
         pop_total_2020_cc = case_when(cc == 1 & afact_num == 1 ~ pop_total_2020,
                                       cc == 1 & afact_num < 1 ~ pop_total_2020*afact_num,
                                       cc == 0 ~ 0),
         pop_total_2020_notcc = case_when(cc == 1 & afact_num == 1 ~ 0,
                                          cc == 1 & afact_num < 1 ~ pop_total_2020*(1-afact_num),
                                          cc == 0 ~ pop_total_2020)) %>%
  select(-merge.variable)

## breakdown of central city indicator ##
paste("BREAKDOWN OF CC INDICATOR")
table(af.wide.final.wua$cc, useNA = "ifany")

## quick check to see that the central city and suburban populations add back to the total ##
popcheck <- af.wide.final.wua %>%
  select(GEOID,
         CBSA,
         cc,
         afact_num,
         pop_total_2000,
         pop_total_2000_cc,
         pop_total_2000_notcc,
         pop_total_2010,
         pop_total_2010_cc,
         pop_total_2010_notcc,
         pop_total_2020,
         pop_total_2020_cc,
         pop_total_2020_notcc) %>%
  mutate(pop_all_2000 = pop_total_2000_cc + pop_total_2000_notcc,
         pop_all_2010 = pop_total_2010_cc + pop_total_2010_notcc,
         pop_all_2020 = pop_total_2020_cc + pop_total_2020_notcc)

paste("DOES CC AND SUB POP EQUAL TOTAL POP IN 2000, 2010, AND 2020?")
sum(round(popcheck$pop_all_2000,2) != round(popcheck$pop_total_2000,2))
sum(round(popcheck$pop_all_2010,2) != round(popcheck$pop_total_2010,2))
sum(round(popcheck$pop_all_2020,2) != round(popcheck$pop_total_2020,2))


## first, analysis of where integration is happening ##
## within and across metros ##

##
## regions ##
##

af.wide.final$sfip_2000_use <- str_pad(af.wide.final$sfips_2000, 2, "left","0")
af.dtable <- as.data.table(af.wide.final)

af.dtable[, region := fcase(
  sfip_2000_use %in% c("09","23","25","44","50","34","36","42","33"), "NE",
  sfip_2000_use %in% c("17","18","26","39","55","19","20","27","29","31","38","46"), "MW",
  sfip_2000_use %in% c("10","11","12","13","24","37","45","51","54","01","21","28","47","05","22","40","48"), "SO",
  sfip_2000_use %in% c("08","56","16","30","35","06","15","53","02","49","32","41","04"), "WE"
)]

#table(af.dtable$sfip_2000_use, af.dtable$region, useNA = "ifany")
  
regions <- af.dtable %>%
  group_by(region) %>%
  summarize(tracts = n(),
            int_2000 = sum(int_jt_relf_a_2000,na.rm=T),
            int_2010 = sum(int_jt_relf_a_2010,na.rm=T),
            int_2020 = sum(int_jt_relf_a_2020,na.rm=T)) %>%
  mutate(pcint2000 = int_2000/tracts,
         pcint2010 = int_2010/tracts,
         pcint2020 = int_2020/tracts,
         ch_0010 = int_2010 - int_2000,
         ch_1020 = int_2020 - int_2010,
         ch_0020 = int_2020 - int_2000,
         pctch_0010 = (int_2010 - int_2000)/int_2000,
         pctch_1020 = (int_2020 - int_2010)/int_2010,
         pctch_0020 = (int_2020 - int_2000)/int_2000)
         
##
## msas ## 
##

paste("CROSSTABS WITH CC AND INT ER REL IN 2000, 2010, AND 2020")
table(af.wide.final.wua$cc, af.wide.final.wua$int_er_rel_2000)
table(af.wide.final.wua$cc, af.wide.final.wua$int_er_rel_2010)
table(af.wide.final.wua$cc, af.wide.final.wua$int_er_rel_2020)

cbsa.names <- census.del.2020.final %>%
  rename(cbsa_name = `CBSA Title`,
         CBSA = `CBSA Code`) %>%
  select(CBSA,
         cbsa_name) %>%
  unique()

##################
## msa analyses ##
##################

## 
## overall integration stats
##

ccvssub.int.ov <- af.wide.final.wua %>%
  group_by(CBSA) %>%
  summarize(tracts = n(),
            int_2000 = sum(int_jt_relf_a_2000,na.rm=T),
            int_2010 = sum(int_jt_relf_a_2010,na.rm=T),
            int_2020 = sum(int_jt_relf_a_2020,na.rm=T)) %>%
  mutate(pcint2000 = int_2000/tracts,
         pcint2010 = int_2010/tracts,
         pcint2020 = int_2020/tracts) %>%
  left_join(cbsa.names, "CBSA") %>%
  select(CBSA,
         cbsa_name,
         everything()) 

#summary(ccvssub.int.ov$tracts)

## 
## overall integration stats among msas with above-average # tracts
##

ccvssub.int.ov.aa <- af.wide.final.wua %>%
  group_by(CBSA) %>%
  summarize(tracts = n(),
            int_2000 = sum(int_jt_relf_a_2000,na.rm=T),
            int_2010 = sum(int_jt_relf_a_2010,na.rm=T),
            int_2020 = sum(int_jt_relf_a_2020,na.rm=T)) %>%
  mutate(pcint2000 = int_2000/tracts,
         pcint2010 = int_2010/tracts,
         pcint2020 = int_2020/tracts) %>%
  left_join(cbsa.names, "CBSA") %>%
  select(CBSA,
         cbsa_name,
         everything()) %>%
  filter(tracts >= 154)

## 
## integration stats across cc and sub tracts
##

ccvssub.int <- af.wide.final.wua %>%
  group_by(CBSA,cc) %>%
  summarize(tracts = n(),
            int_2000 = sum(int_jt_relf_a_2000,na.rm=T),
            int_2010 = sum(int_jt_relf_a_2010,na.rm=T),
            int_2020 = sum(int_jt_relf_a_2020,na.rm=T)) %>%
  mutate(cc_txt = ifelse(cc == 1, "cc", "sub"),
         pcint2000 = int_2000/tracts,
         pcint2010 = int_2010/tracts,
         pcint2020 = int_2020/tracts) %>%
  select(-cc) %>%
  pivot_wider(names_from = cc_txt, values_from = c(tracts:pcint2020)) %>%
  mutate(diff00 = pcint2000_cc - pcint2000_sub,
         diff10 = pcint2010_cc - pcint2010_sub,
         diff20 = pcint2020_cc - pcint2020_sub) %>%
  left_join(cbsa.names, "CBSA") %>%
  select(CBSA,
         cbsa_name,
         everything())

paste("INTEGRATION STATS ACROSS CC AND SUB TRACTS (2000-2020)")
summary(ccvssub.int)


##
## now do the same, except with abs measures ## 
##

ccvssub.int <- af.wide.final.wua %>%
  group_by(CBSA,cc) %>%
  summarize(tracts = n(),
            int_2000 = sum(int_jt_abs_2000,na.rm=T),
            int_2010 = sum(int_jt_abs_2010,na.rm=T),
            int_2020 = sum(int_jt_abs_2020,na.rm=T)) %>%
  mutate(cc_txt = ifelse(cc == 1, "cc", "sub"),
         pcint2000 = int_2000/tracts,
         pcint2010 = int_2010/tracts,
         pcint2020 = int_2020/tracts) %>%
  select(-cc) %>%
  pivot_wider(names_from = cc_txt, values_from = c(tracts:pcint2020)) %>%
  mutate(diff00 = pcint2000_cc - pcint2000_sub,
         diff10 = pcint2010_cc - pcint2010_sub,
         diff20 = pcint2020_cc - pcint2020_sub)

paste("ABS INT STATS ACROSS CC AND SUB TRACTS (2000-2020)")
summary(ccvssub.int)

##
## finally, do this for 1980 - 2000 ## 
##

load("003_8090_trts.Rda")

all.decades.m <- stata.merge(metro.tracts.8090.fin,
                             af.wide.final, 
                             "GEOID")

## check merge ##
paste("MERGE BETWEEN 8090 TRACT FILE AND ANALYTIC FILE")
table(all.decades.m$merge.variable, useNA = "ifany")

## keep matches ##
all.decades <- all.decades.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

af.ad.m <- stata.merge(all.decades,
                       cc.tracts,
                       "GEOID")

## check merge ## 
paste("MERGE BETWEEN ALL DECADES FILE AND CC INDICATOR FILE")
table(af.ad.m$merge.variable, useNA = "ifany")

## create ua indicator ##
af.ad <- af.ad.m %>%
  filter(merge.variable %in% c(1,3)) %>%
  mutate(cc = ifelse(merge.variable == 3, 1, 0))

##
## msa analyses dating back to 1980
##

ccvssub.int.8000 <- af.ad %>%
  group_by(CBSA,cc) %>%
  summarize(tracts = n(),
            int_1980 = sum(int_er_abs_1980,na.rm=T),
            int_1990 = sum(int_er_abs_1990,na.rm=T),
            int_2000 = sum(int_er_abs_2000,na.rm=T)) %>%
  mutate(cc_txt = ifelse(cc == 1, "cc", "sub"),
         pcint1980 = int_1980/tracts,
         pcint1990 = int_1990/tracts,
         pcint2000 = int_2000/tracts) %>%
  select(-cc) %>%
  pivot_wider(names_from = cc_txt, values_from = c(tracts:pcint2000)) %>%
  mutate(diff80 = pcint1980_cc - pcint1980_sub,
         diff90 = pcint1990_cc - pcint1990_sub,
         diff00 = pcint2000_cc - pcint2000_sub)

paste("INTEGRATION STATS ACROSS CC AND SUB TRACTS (1980-2000)")
summary(ccvssub.int.8000)

##################
## HHF ANALYSIS ##
##################

paste("INT NEIGHBORHOODS OVERALL 2000, 2010, and 2020")
table(af.wide.final$int_jt_relf_a_2000, useNA = "ifany")
table(af.wide.final$int_jt_relf_a_2010, useNA = "ifany")

paste("% GROWTH FROM 2000 - 2010")
(table(af.wide.final$int_jt_relf_a_2010, useNA = "ifany")[2]-
 table(af.wide.final$int_jt_relf_a_2000, useNA = "ifany")[2])/
  table(af.wide.final$int_jt_relf_a_2000, useNA = "ifany")[2]
table(af.wide.final$int_jt_relf_a_2020, useNA = "ifany")

paste("% GROWTH FROM 2010 - 2020")
(table(af.wide.final$int_jt_relf_a_2020, useNA = "ifany")[2]-
    table(af.wide.final$int_jt_relf_a_2010, useNA = "ifany")[2])/
  table(af.wide.final$int_jt_relf_a_2010, useNA = "ifany")[2]

paste("INT NEIGHBORHOODS SHARES OVERALL 2000, 2010, and 2020")
prop.table(table(af.wide.final$int_jt_relf_a_2000, useNA = "ifany"))
prop.table(table(af.wide.final$int_jt_relf_a_2010, useNA = "ifany"))
prop.table(table(af.wide.final$int_jt_relf_a_2020, useNA = "ifany"))

## 
## stratify by hhf
##

hhf <- af.wide.final %>%
  filter(substr(GEOID,1,2) %in% c("01","04","06","12","13",
                                  "17","18","21","26","28",
                                  "32","34","37","39","41",
                                  "44","45","47","11"))

paste("INT JT RELFA SHARES IN HHF TRACTS 2000, 2010, AND 2020")
prop.table(table(hhf$int_jt_relf_a_2000, useNA = "ifany"))
prop.table(table(hhf$int_jt_relf_a_2010, useNA = "ifany"))
prop.table(table(hhf$int_jt_relf_a_2020, useNA = "ifany"))

paste("INT JT RELF A IN HHF TRACTS 2000, 2010, AND 2020")
table(hhf$int_jt_relf_a_2000, useNA = "ifany")
table(hhf$int_jt_relf_a_2010, useNA = "ifany")

paste("% GROWTH FROM 2000 - 2010")
(table(hhf$int_jt_relf_a_2010, useNA = "ifany")[2]-
    table(hhf$int_jt_relf_a_2000, useNA = "ifany")[2])/
  table(hhf$int_jt_relf_a_2000, useNA = "ifany")[2]

table(hhf$int_jt_relf_a_2020, useNA = "ifany")

paste("% GROWTH FROM 2010 - 2020")
(table(hhf$int_jt_relf_a_2020, useNA = "ifany")[2]-
    table(hhf$int_jt_relf_a_2010, useNA = "ifany")[2])/
  table(hhf$int_jt_relf_a_2010, useNA = "ifany")[2]

non.hhf <- af.wide.final %>%
  filter(GEOID %notin% hhf$GEOID)

paste("INT JT RELFA SHARES IN NON-HHF TRACTS 2000, 2010, AND 2020")
prop.table(table(non.hhf$int_jt_relf_a_2000, useNA = "ifany"))
prop.table(table(non.hhf$int_jt_relf_a_2010, useNA = "ifany"))
prop.table(table(non.hhf$int_jt_relf_a_2020, useNA = "ifany"))

paste("INT JT RELF A IN NON-HHF TRACTS 2000, 2010, AND 2020")
table(non.hhf$int_jt_relf_a_2000, useNA = "ifany")
table(non.hhf$int_jt_relf_a_2010, useNA = "ifany")

paste("% GROWTH FROM 2000 - 2010")
(table(non.hhf$int_jt_relf_a_2010, useNA = "ifany")[2]-
    table(non.hhf$int_jt_relf_a_2000, useNA = "ifany")[2])/
  table(non.hhf$int_jt_relf_a_2000, useNA = "ifany")[2]

table(non.hhf$int_jt_relf_a_2020, useNA = "ifany")

paste("% GROWTH FROM 2010 - 2020")
(table(non.hhf$int_jt_relf_a_2020, useNA = "ifany")[2]-
    table(non.hhf$int_jt_relf_a_2010, useNA = "ifany")[2])/
  table(non.hhf$int_jt_relf_a_2010, useNA = "ifany")[2]

##
## population shares in int neighborhoods
##

paste("SHARE OF 2000 POP IN INT NEIGHBORHOODS")
sum(af.wide.final$pop_total_2000[af.wide.final$int_jt_relf_a_2000==1])/sum(af.wide.final$pop_total_2000)
sum(af.wide.final$pop_nh_asian_2000[af.wide.final$int_jt_relf_a_2000==1])/sum(af.wide.final$pop_nh_asian_2000)
sum(af.wide.final$pop_nh_black_2000[af.wide.final$int_jt_relf_a_2000==1])/sum(af.wide.final$pop_nh_black_2000)
sum(af.wide.final$pop_hl_2000[af.wide.final$int_jt_relf_a_2000==1])/sum(af.wide.final$pop_hl_2000)
sum(af.wide.final$pop_nh_white_2000[af.wide.final$int_jt_relf_a_2000==1])/sum(af.wide.final$pop_nh_white_2000)

paste("SHARE OF 2010 POP IN INT NEIGHBORHOODS")
sum(af.wide.final$pop_total_2010[af.wide.final$int_jt_relf_a_2010==1])/sum(af.wide.final$pop_total_2010)
sum(af.wide.final$pop_nh_asian_2010[af.wide.final$int_jt_relf_a_2010==1])/sum(af.wide.final$pop_nh_asian_2010)
sum(af.wide.final$pop_nh_black_2010[af.wide.final$int_jt_relf_a_2010==1])/sum(af.wide.final$pop_nh_black_2010)
sum(af.wide.final$pop_hl_2010[af.wide.final$int_jt_relf_a_2010==1])/sum(af.wide.final$pop_hl_2010)
sum(af.wide.final$pop_nh_white_2010[af.wide.final$int_jt_relf_a_2010==1])/sum(af.wide.final$pop_nh_white_2010)

paste("SHARE OF 2020 POP IN INT NEIGHBORHOODS")
sum(af.wide.final$pop_total_2020[af.wide.final$int_jt_relf_a_2020==1])/sum(af.wide.final$pop_total_2020)
sum(af.wide.final$pop_nh_asian_2020[af.wide.final$int_jt_relf_a_2020==1])/sum(af.wide.final$pop_nh_asian_2020)
sum(af.wide.final$pop_nh_black_2020[af.wide.final$int_jt_relf_a_2020==1])/sum(af.wide.final$pop_nh_black_2020)
sum(af.wide.final$pop_hl_2020[af.wide.final$int_jt_relf_a_2020==1])/sum(af.wide.final$pop_hl_2020)
sum(af.wide.final$pop_nh_white_2020[af.wide.final$int_jt_relf_a_2020==1])/sum(af.wide.final$pop_nh_white_2020)






### END OF PROGRAM ###

sink()
sink(type = "message")

         